import pandas as pd
import networkx as nx
import requests
# Column names for the sFlow CSV export, in file order. The file is read
# with explicit names, so its first line is treated as data, not a header.
SFLOW_COLUMNS = [
    'Type',
    'SFlowAgentAddr',
    'InputPort',
    'OutputPort',
    'srcMAC',
    'dstMAC',
    'EthernetType',
    'InVLAN',
    'OutVLAN',
    'SrcIP',
    'DstIP',
    'IPProtocol',
    'IpTOS',
    'IpTTL',
    'UDPSrcPort',
    'UDPDstPort',
    'TCPFlags',
    'PacketSize',
    'IPSize',
    'SamplingRate',
]

# index_col=False stops pandas from promoting the first column to the index.
df = pd.read_csv(
    'singarenSFlow-21-02-17-140301.csv',
    names=SFLOW_COLUMNS,
    index_col=False,
)
df
| Type | SFlowAgentAddr | InputPort | OutputPort | srcMAC | dstMAC | EthernetType | InVLAN | OutVLAN | SrcIP | DstIP | IPProtocol | IpTOS | IpTTL | UDPSrcPort | UDPDstPort | TCPFlags | PacketSize | IPSize | SamplingRate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FLOW | 203.30.38.251 | 129 | 193 | 00135f21bc80 | 0031466b23cf | 0x0800 | 11.0 | 919 | 155.69.160.77 | 74.125.130.141 | 6 | 0x60 | 59 | 57577.0 | 443 | 0x10 | 74 | 52 | 2048 |
| 1 | FLOW | 203.30.38.251 | 129 | 193 | 00135f21bc80 | 0031466b23cf | 0x0800 | 11.0 | 919 | 155.69.160.78 | 74.125.203.128 | 6 | 0x00 | 59 | 34809.0 | 443 | 0x10 | 1442 | 1420 | 2048 |
| 2 | FLOW | 203.30.38.251 | 131 | 193 | 001cb0c88e40 | 0031466b23cf | 0x0800 | 43.0 | 919 | 192.122.131.134 | 74.125.10.12 | 6 | 0x00 | 57 | 50549.0 | 443 | 0x10 | 102 | 80 | 2048 |
| 3 | FLOW | 203.30.38.251 | 135 | 129 | 002688cd5fc7 | 00135f21bc80 | 0x0800 | 919.0 | 11 | 54.169.229.179 | 155.69.191.254 | 17 | 0x00 | 57 | 16285.0 | 39641 | 0x10 | 580 | 562 | 2048 |
| 4 | FLOW | 203.30.38.251 | 193 | 130 | 0031466b23cf | 00239cd087c1 | 0x0800 | 919.0 | 919 | 209.85.229.247 | 137.132.228.34 | 17 | 0x00 | 63 | 443.0 | 32817 | 0x00 | 1396 | 1378 | 2048 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 203998 | FLOW | 203.30.38.251 | 129 | 193 | 00135f21bc80 | 0031466b23cf | 0x0800 | 11.0 | 919 | 155.69.203.215 | 74.125.10.7 | 6 | 0x00 | 122 | 55177.0 | 443 | 0x10 | 62 | 40 | 2048 |
| 203999 | FLOW | 203.30.38.251 | 193 | 130 | 0031466b23cf | 00239cd087c1 | 0x0800 | 919.0 | 919 | 173.194.22.204 | 137.132.17.41 | 17 | 0x00 | 63 | 443.0 | 54938 | 0x00 | 1396 | 1378 | 2048 |
| 204000 | FLOW | 203.30.38.251 | 200 | 130 | 80711fc76001 | 00239cd087c1 | 0x0800 | 280.0 | 919 | 169.229.205.229 | 137.132.22.254 | 17 | 0x00 | 247 | 443.0 | 3090 | 0x00 | 1487 | 1465 | 2048 |
| 204001 | FLOW | 203.30.38.251 | 193 | 130 | 0031466b23cf | 00239cd087c1 | 0x0800 | 919.0 | 919 | 74.125.10.40 | 137.132.188.86 | 6 | 0x00 | 63 | 443.0 | 54901 | 0x10 | 1438 | 1420 | 2048 |
| 204002 | FLOW | 203.30.38.251 | 193 | 130 | 0031466b23cf | 00239cd087c1 | 0x0800 | 919.0 | 919 | 173.194.22.204 | 137.132.17.41 | 17 | 0x00 | 63 | 443.0 | 54938 | 0x10 | 1396 | 1378 | 2048 |
204003 rows × 20 columns
import json
def get_org(ip):
    """Return the organisation that ipapi.co reports for *ip*.

    This is a best-effort lookup: an empty string is returned when the
    request fails or times out, when the response body is not valid JSON,
    or when the JSON payload carries no ``'org'`` entry (for example an
    error payload).

    Args:
        ip: IP address to look up; it is interpolated into the request URL.

    Returns:
        The value of the ``'org'`` field of the response, or ``''`` when the
        lookup could not be completed.
    """
    url = f"https://ipapi.co/{ip}/json/"
    try:
        # A timeout keeps one unresponsive lookup from hanging a whole
        # DataFrame.apply() pass over many addresses.
        resp = requests.get(url, timeout=10)
        return resp.json()['org']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # RequestException: connection problems / timeouts.
        # ValueError: body is not JSON.
        # KeyError / TypeError: JSON has no 'org' key or is not an object.
        return ''
# Top 5 source addresses by number of sampled flow records, each annotated
# with the organisation returned by get_org().
top_talkers = (
    df['SrcIP']
    .value_counts()
    .rename_axis('SrcIP')
    .reset_index(name='SrcIP_Packet_Count')
    .nlargest(5, 'SrcIP_Packet_Count')
)
top_talkers['Organisation'] = top_talkers['SrcIP'].map(get_org)
top_talkers
| SrcIP | SrcIP_Packet_Count | Organisation | |
|---|---|---|---|
| 0 | 103.26.47.233 | 9646 | Malaysian Research & Education Network |
| 1 | 13.107.4.50 | 4950 | MICROSOFT-CORP-MSN-AS-BLOCK |
| 2 | 155.69.160.78 | 4563 | Nanyang Technological University |
| 3 | 130.14.250.7 | 3914 | NLM-GW |
| 4 | 173.194.22.215 | 2896 |
# Top 5 destination addresses by number of sampled flow records, each
# annotated with the organisation returned by get_org().
top_listeners = (
    df['DstIP']
    .value_counts()
    .rename_axis('DstIP')
    .reset_index(name='DstIP_Packet_Count')
    .nlargest(5, 'DstIP_Packet_Count')
)
top_listeners['Organisation'] = top_listeners['DstIP'].map(get_org)
top_listeners
| DstIP | DstIP_Packet_Count | Organisation | |
|---|---|---|---|
| 0 | 103.22.221.73 | 9646 | National Infomation Society Agency |
| 1 | 137.132.228.33 | 7835 | NUS Information Technology |
| 2 | 137.132.228.29 | 5964 | NUS Information Technology |
| 3 | 137.132.228.42 | 4987 | NUS Information Technology |
| 4 | 103.37.198.100 | 3915 | A-STAR |
# Distinct values of the IPProtocol column, in order of first appearance.
allProtocols = df["IPProtocol"].unique()
allProtocols
array([ 6, 17, 47, 50, 0, 1, 41, 58, 2, 89], dtype=int64)
# Display names for the IP protocol numbers seen in the capture.
# NOTE(review): 58, 2 and 89 are labelled "IGNORE" here; per the IANA
# protocol-number registry they appear to be IPv6-ICMP, IGMP and OSPF.
# Confirm the placeholder label is intentional.
ip_protocol_dict = {
    50: "ESP",
    6: "TCP",
    17: "UDP",
    0: "HOPOPT",
    47: "GRE",
    41: "IPv6",
    1: "ICMP",
    # NOTE(review): 381 is outside the 0-255 range of the IP protocol field,
    # so this entry can never match - probably a typo; TODO confirm.
    381: "Reserved",
    58: "IGNORE",
    2: "IGNORE",
    89: "IGNORE",
}

# Report the record count and share of all records for these protocols only.
selected_protocols = ["TCP", "UDP"]
total_count = len(df)
for protocol in allProtocols:
    # .get() instead of [] so a protocol number that is missing from the
    # table is skipped rather than aborting the loop with a KeyError.
    protocol_name = ip_protocol_dict.get(protocol)
    if protocol_name not in selected_protocols:
        continue
    print(protocol_name)
    packetsWithSameProtocol = df[df["IPProtocol"] == protocol]
    count = len(packetsWithSameProtocol)
    percentage = count / total_count * 100
    print("Count: ", count)
    print("Percentage: ", percentage)
TCP Count: 155799 Percentage: 76.3709357215335 UDP Count: 45377 Percentage: 22.243300343622398
# Number of sampled flow records per destination port (UDPDstPort column),
# most frequent port first.
destination_df = pd.DataFrame(df["UDPDstPort"].value_counts()).reset_index()
destination_df.columns = ["Destination_IP_Port_Number", "Packet_Count"]
# sort_values() returns a new frame instead of sorting in place, so the
# result has to be assigned back; previously it was discarded. Descending
# order matches what value_counts() yields by default, and the stable sort
# keeps the existing order among ports with equal counts.
destination_df = destination_df.sort_values(
    by="Packet_Count", ascending=False, kind="stable"
)
destination_df
| Destination_IP_Port_Number | Packet_Count | |
|---|---|---|
| 0 | 443 | 42975 |
| 1 | 80 | 11960 |
| 2 | 56800 | 3918 |
| 3 | 15000 | 2697 |
| 4 | 44678 | 1158 |
| ... | ... | ... |
| 21913 | 1848 | 1 |
| 21914 | 3897 | 1 |
| 21915 | 10044 | 1 |
| 21916 | 960027827 | 1 |
| 21917 | 34799 | 1 |
21918 rows × 2 columns
# Estimate the total traffic represented by the sampled records.
#
# Each record stands for `SamplingRate` packets, so its size is scaled by the
# per-record SamplingRate column rather than a hard-coded 1-in-1000 factor
# (the records shown in this capture carry 2048).
# NOTE(review): IPSize is treated as a byte count here - the values stay
# below a typical 1500-byte MTU and track PacketSize minus the link-layer
# header - whereas the earlier code divided by 8 as if it were bits.
# Confirm against the sFlow export documentation.
sampling_rates = pd.to_numeric(df['SamplingRate'], errors='coerce')
total_size_bytes = (df['IPSize'] * sampling_rates).sum()
total_size_megabyte = total_size_bytes / 1024 / 1024
total_traffic = total_size_megabyte
print(f"Total Traffic: {total_traffic} MB")
Total Traffic: 23742.154479026794 MB
# Number of sampled flow records for every ordered (SrcIP, DstIP) pair.
pair_sizes = df.groupby(['SrcIP', 'DstIP']).size()
comm_pair_df = pair_sizes.reset_index(name='Communication_Count')

# The five busiest ordered pairs. reset_index() is called without drop=True,
# so each row's position in comm_pair_df is kept in an 'index' column.
top5_pairs = comm_pair_df.nlargest(5, 'Communication_Count')
comm_pair_df_top5 = top5_pairs.reset_index()
comm_pair_df_top5
| index | SrcIP | DstIP | Communication_Count | |
|---|---|---|---|---|
| 0 | 199 | 103.26.47.233 | 103.22.221.73 | 9646 |
| 1 | 3758 | 130.14.250.7 | 103.37.198.100 | 3914 |
| 2 | 3450 | 129.99.230.54 | 137.132.22.74 | 2689 |
| 3 | 638 | 104.44.201.147 | 202.21.159.244 | 2454 |
| 4 | 15296 | 202.21.159.244 | 104.44.201.147 | 2154 |
# Count communications per *unordered* address pair: A -> B and B -> A are
# accumulated under a single key.
#
# Keys are (src, dst) tuples rather than "src & dst" strings, so nothing has
# to be joined and split back apart. A pair is stored in the direction in
# which it was first seen, so the SrcIP/DstIP labels of the result match the
# first record of that conversation.
ip_pairs = {}
# Iterating the two columns directly avoids building a Series per row, which
# is what DataFrame.iterrows() does.
for src_ip, dst_ip in zip(df['SrcIP'], df['DstIP']):
    forward = (src_ip, dst_ip)
    backward = (dst_ip, src_ip)
    # Reuse the reversed key only when the pair was first recorded in the
    # opposite direction; otherwise count under the forward key.
    if forward not in ip_pairs and backward in ip_pairs:
        key = backward
    else:
        key = forward
    ip_pairs[key] = ip_pairs.get(key, 0) + 1

# sorted() is stable, so pairs with equal counts keep first-seen order.
sorted_pairs = sorted(ip_pairs.items(), key=lambda item: item[1], reverse=True)

# Passing explicit column names also yields a well-formed (empty) frame when
# there are no records at all.
comm_pair_undirected_df = pd.DataFrame(
    [(src_ip, dst_ip, count) for (src_ip, dst_ip), count in sorted_pairs],
    columns=['SrcIP', 'DstIP', 'Communication_Count'],
)
comm_pair_undirected_df
| SrcIP | DstIP | Communication_Count | |
|---|---|---|---|
| 0 | 103.22.221.73 | 103.26.47.233 | 11092 |
| 1 | 104.44.201.147 | 202.21.159.244 | 4608 |
| 2 | 103.37.198.100 | 130.14.250.7 | 4358 |
| 3 | 137.132.22.74 | 129.99.230.54 | 3203 |
| 4 | 155.69.52.27 | 128.117.28.212 | 1572 |
| ... | ... | ... | ... |
| 21967 | 155.69.160.78 | 52.219.36.24 | 1 |
| 21968 | 137.132.232.42 | 209.85.229.202 | 1 |
| 21969 | 216.58.221.78 | 155.69.96.21 | 1 |
| 21970 | 203.117.152.202 | 202.6.241.96 | 1 |
| 21971 | 104.66.0.35 | 155.69.218.47 | 1 |
21972 rows × 3 columns
# Independent (deep) copy of the ordered-pair counts for visualisation, so
# changes made to vis_df do not touch comm_pair_df.
vis_df = comm_pair_df.copy(deep=True)
vis_df
| SrcIP | DstIP | Communication_Count | |
|---|---|---|---|
| 0 | - | - | 3 |
| 1 | 0 | 0 | 90 |
| 2 | 0.0.0.0 | 255.255.255.255 | 1 |
| 3 | 1.179.247.15 | 220.156.177.1 | 1 |
| 4 | 1.193.219.18 | 123.136.64.30 | 1 |
| ... | ... | ... | ... |
| 24933 | fe80:0000:0000:0000:6600:f1ff:fec6:1e10 | ff02:0000:0000:0000:0000:0000:0000:0005 | 1 |
| 24934 | fe80:0000:0000:0000:69bd:5711:b817:c9d8 | ff02:0000:0000:0000:0000:0000:0000:0016 | 1 |
| 24935 | fe80:0000:0000:0000:69bd:5711:b817:c9d8 | ff02:0000:0000:0000:0000:0000:0001:0003 | 2 |
| 24936 | fe80:0000:0000:0000:69bd:5711:b817:c9d8 | ff02:0000:0000:0000:0000:0001:ff9c:6300 | 1 |
| 24937 | fe80:0000:0000:0000:ae4b:c800:6883:67c0 | ff02:0000:0000:0000:0000:0001:ff00:0040 | 1 |
24938 rows × 3 columns
import networkx as nx
import matplotlib.pyplot as plt

# Build the graph of ordered (SrcIP -> DstIP) pairs as a directed graph.
# from_pandas_edgelist() creates an undirected nx.Graph unless told
# otherwise; with that default the rows A -> B and B -> A collapse into one
# edge and the Communication_Count of the later row silently replaces the
# earlier one.
# NOTE(review): assumes this plot is meant to be the directed counterpart of
# the undirected graph drawn from comm_pair_undirected_df - confirm.
graph = nx.from_pandas_edgelist(
    vis_df,
    source='SrcIP',
    target='DstIP',
    edge_attr='Communication_Count',
    create_using=nx.DiGraph,
)

plt.figure(figsize=(100, 100))
positions = nx.spring_layout(graph, weight="Communication_Count")
nodes = nx.draw_networkx_nodes(
    graph, positions, node_size=1000, node_color='b', node_shape='s'
)
# The edge drawer is given the node size/shape so that arrowheads end at the
# node border instead of being hidden underneath the large square markers.
edges = nx.draw_networkx_edges(
    graph, positions, width=8, node_size=1000, node_shape='s'
)
plt.show()
# Plot the unordered-pair counts as a graph; Communication_Count is attached
# to each edge and used as the weight for the spring layout.
vis_df_undirected = comm_pair_undirected_df.copy()
graph_undirected = nx.from_pandas_edgelist(
    vis_df_undirected,
    source='SrcIP',
    target='DstIP',
    edge_attr='Communication_Count',
)

plt.figure(figsize=(100, 100))
positions = nx.spring_layout(graph_undirected, weight="Communication_Count")
nodes = nx.draw_networkx_nodes(
    graph_undirected,
    positions,
    node_size=1000,
    node_color='b',
    node_shape='s',
)
edges = nx.draw_networkx_edges(graph_undirected, positions, width=8)
plt.show()
# Sum of PacketSize over all records of each source address, as a
# one-column frame indexed by SrcIP.
largest_sent_nodes = df.groupby("SrcIP")["PacketSize"].sum().to_frame()
# Displayed largest-first; the sorted frame is not assigned back, so
# largest_sent_nodes itself keeps the groupby order.
largest_sent_nodes.sort_values(by="PacketSize", ascending=False)
| PacketSize | |
|---|---|
| SrcIP | |
| 9745144 | 45986236 |
| 9745071 | 45985937 |
| 9744997 | 45985638 |
| 9744926 | 45985297 |
| 9744852 | 45985008 |
| ... | ... |
| 224638 | 0 |
| 224593 | 0 |
| 224595 | 0 |
| 224637 | 0 |
| 224655 | 0 |
7862 rows × 1 columns